{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c5688685-cfb9-41e5-b0bb-27ac87757ada",
   "metadata": {},
   "outputs": [],
   "source": [
    "# download presidio\n",
    "!pip install presidio_analyzer presidio_anonymizer"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cb8e0bdb-3138-44ad-8d87-d9d549c51ce5",
   "metadata": {},
   "source": [
    "###### Path to notebook: [https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/ner_model_configuration.ipynb](https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/ner_model_configuration.ipynb)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4b7961b8-34ab-40fd-8672-2f988f115c17",
   "metadata": {},
   "source": [
    "# Configuring the NER model\n",
    "\n",
    "This notebook contains a few examples to customize and configure the NER model through code.\n",
    "Examples:\n",
    "1. Changing the default model's parameters\n",
    "2. Using Stanza as the NER engine\n",
    "3. Using transformers as the NER engine\n",
    "4. Supporting multiple languages\n",
    "\n",
    "This notebook complements the documentation, which primarily focuses on reading the NER configuration from file"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bbe0e30e-b040-4c72-9743-eba443529084",
   "metadata": {},
   "source": [
    "### 1. Changing the default model's parameters\n",
    "\n",
    "In this example, we'll change the models' default confidence score (spaCy models do not generally output confidence per prediction, so we add a default score(. In addition, we'll change the types of PII entities the model returns."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "130eb964-e141-4ebd-999f-ccce659d2adb",
   "metadata": {},
   "outputs": [],
   "source": [
    "from presidio_analyzer import AnalyzerEngine\n",
    "from presidio_analyzer.nlp_engine import NlpEngine, SpacyNlpEngine, NerModelConfiguration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "dabc2b20-6e83-48e5-a2c9-1398aec8c456",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define which model to use\n",
    "model_config = [{\"lang_code\": \"en\", \"model_name\": \"en_core_web_lg\"}]\n",
    "\n",
    "# Define which entities the model returns and how they map to Presidio's\n",
    "entity_mapping = dict(\n",
    "    PER=\"PERSON\",\n",
    "    LOC= \"LOCATION\",\n",
    "    GPE=\"LOCATION\",\n",
    "    ORG=\"ORGANIZATION\"\n",
    ")\n",
    "\n",
    "ner_model_configuration = NerModelConfiguration(default_score = 0.6, \n",
    "                                                model_to_presidio_entity_mapping=entity_mapping)\n",
    "\n",
    "# Create the NLP Engine based on this configuration\n",
    "spacy_nlp_engine = SpacyNlpEngine(models= model_config, ner_model_configuration=ner_model_configuration)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "7e11eee9-4ac4-4533-960c-8c18509fb7e3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Helper method to use the NLP Engine as part of Presidio Analyzer, and print configuration+results\n",
    "\n",
    "def call_analyzer_and_print_results(nlp_engine: NlpEngine,\n",
    "                                    language: str = \"en\",\n",
    "                                    text: str = \"Bill Clinton used to be the president of the United States\") -> None:\n",
    "    \"\"\"\n",
    "    Instantiate the AnalyzerEngine with the provided nlp_engine and return output.\n",
    "\n",
    "    This method creates an AnalyzerEngine instance with the provided NlpEngine, and three supported languages (en, es, de)\n",
    "    Then, it calls the analyze method to return identified PII.\n",
    "\n",
    "    :param nlp_engine: The NlpEngine instance as configured by the user\n",
    "    :param language: the language the request should support (in contrast to the AnalyzerEngine which can support multiple)\n",
    "    :param text: The text to look for PII entities in.\n",
    "\n",
    "    \"\"\"\n",
    "    \n",
    "    print(f\"Input text:\\n\\t{text}\\n\")\n",
    "    \n",
    "    # Initialize the AnalyzerEngine with the configured Nlp Engine:\n",
    "    analyzer = AnalyzerEngine(nlp_engine=nlp_engine, \n",
    "                              supported_languages=[\"en\", \"de\", \"es\"])\n",
    "\n",
    "    # Print the NLP Engine's configuration\n",
    "    print(f\"NLP Engine configuration:\\n\\tLoaded NLP engine: {analyzer.nlp_engine.__class__.__name__}\")\n",
    "    print(f\"\\tSupported entities: {analyzer.nlp_engine.get_supported_entities()}\")\n",
    "    print(f\"\\tSupported languages: {analyzer.nlp_engine.get_supported_languages()}\")\n",
    "    print()\n",
    "    \n",
    "    # Call the analyzer.analyze to detect PII entities (from the NLP engine + all other recognizers)\n",
    "    results = analyzer.analyze(text=text, \n",
    "                               language=language, \n",
    "                               return_decision_process=True)\n",
    "\n",
    "    # sort results\n",
    "    results = sorted(results, key= lambda x: x.start)\n",
    "    \n",
    "    # Print results\n",
    "    print(\"Returning full results, including the decision process:\")\n",
    "    for i, result in enumerate(results):\n",
    "        print(f\"\\tResult {i}: {result}\")\n",
    "        print(f\"\\tDetected text: {text[result.start: result.end]}\")\n",
    "        print(f\"\\t{result.analysis_explanation.textual_explanation}\")\n",
    "        print(\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "1004a788-8111-4793-b7b5-e7759358ab2d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Input text:\n",
      "\tBill Clinton used to be the president of the United States\n",
      "\n",
      "NLP Engine configuration:\n",
      "\tLoaded NLP engine: SpacyNlpEngine\n",
      "\tSupported entities: ['LOCATION', 'PERSON', 'ORGANIZATION']\n",
      "\tSupported languages: ['en']\n",
      "\n",
      "Returning full results, including the decision process:\n",
      "\tResult 0: type: PERSON, start: 0, end: 12, score: 0.6\n",
      "\tDetected text: Bill Clinton\n",
      "\tIdentified as PERSON by Spacy's Named Entity Recognition\n",
      "\n",
      "\tResult 1: type: LOCATION, start: 41, end: 58, score: 0.6\n",
      "\tDetected text: the United States\n",
      "\tIdentified as LOCATION by Spacy's Named Entity Recognition\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Run it as part of Presidio's AnalyzerEngine\n",
    "call_analyzer_and_print_results(spacy_nlp_engine)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "394f8f23-15e7-4767-8298-17170fa2d316",
   "metadata": {},
   "source": [
    "## 2. Using Stanza"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6491fa00-4229-4792-9120-ffe601a1cb2f",
   "metadata": {},
   "source": [
    "Stanza is an NLP package by Stanford. More details on Stanza can be found here: https://stanfordnlp.github.io/stanza/\n",
    "Loading Stanza instead of spaCy is straightforward. Just use `StanzaNlpEngine` instead of `SpacyNlpEngine` and define a model name supported by stanza (for example, `en` instead of `en_core_web_lg`)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9e660475-0043-4a2e-a198-9eca1cb307a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "from presidio_analyzer.nlp_engine import StanzaNlpEngine, NerModelConfiguration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1a29cbd9-8298-494f-812b-fb5d3a460ed0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define which model to use\n",
    "model_config = [{\"lang_code\": \"en\", \"model_name\": \"en\"}]\n",
    "\n",
    "# Define which entities the model returns and how they map to Presidio's\n",
    "entity_mapping = dict(\n",
    "    PER=\"PERSON\",\n",
    "    LOC= \"LOCATION\",\n",
    "    GPE=\"LOCATION\",\n",
    "    ORG=\"ORGANIZATION\"\n",
    ")\n",
    "\n",
    "ner_model_configuration = NerModelConfiguration(model_to_presidio_entity_mapping=entity_mapping)\n",
    "\n",
    "# Create the Stanza NLP Engine based on this configuration\n",
    "stanza_nlp_engine = StanzaNlpEngine(models= model_config, ner_model_configuration=ner_model_configuration)\n",
    "\n",
    "# Run it as part of Presidio's AnalyzerEngine\n",
    "call_analyzer_and_print_results(stanza_nlp_engine)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c118badd-c924-4b3b-ad6a-24cad1c5ffec",
   "metadata": {},
   "source": [
    "## 3. Using transformers as the NLP engine"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5c27a1f8-5eb6-4b98-bc67-de7470f5786c",
   "metadata": {},
   "source": [
    "A third option is to use a model based on the `transformers` package. Note that in this case, we use both spaCy and transformers. The actual PII entities are detected using a transformers model, but additional text features such as lemmas and others, are extracted from a spaCy pipeline. We use a small spaCy model as it's faster and more memory efficient."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "78e9e9a1-743b-4764-add1-8963b6116460",
   "metadata": {},
   "outputs": [],
   "source": [
    "from presidio_analyzer.nlp_engine import TransformersNlpEngine, NerModelConfiguration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bf14cce2-f593-43f2-9d3e-12a99f0cbbbe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define which model to use\n",
    "model_config = [{\n",
    "   \"lang_code\":\"en\",\n",
    "   \"model_name\":{\n",
    "      \"spacy\":\"en_core_web_sm\",\n",
    "      \"transformers\":\"obi/deid_roberta_i2b2\"\n",
    "   }\n",
    "}]\n",
    "\n",
    "# Map transformers model labels to Presidio's\n",
    "model_to_presidio_entity_mapping = dict(\n",
    "    PER=\"PERSON\",\n",
    "    PERSON=\"PERSON\",\n",
    "    LOC= \"LOCATION\",\n",
    "    LOCATION= \"LOCATION\",\n",
    "    GPE=\"LOCATION\",\n",
    "    ORG=\"ORGANIZATION\",\n",
    "    ORGANIZATION=\"ORGANIZATION\",\n",
    "    NORP=\"NRP\",\n",
    "    AGE=\"AGE\",\n",
    "    ID=\"ID\",\n",
    "    EMAIL=\"EMAIL\",\n",
    "    PATIENT=\"PERSON\",\n",
    "    STAFF=\"PERSON\",\n",
    "    HOSP=\"ORGANIZATION\",\n",
    "    PATORG=\"ORGANIZATION\",\n",
    "    DATE=\"DATE_TIME\",\n",
    "    TIME=\"DATE_TIME\",\n",
    "    PHONE=\"PHONE_NUMBER\",\n",
    "    HCW=\"PERSON\",\n",
    "    HOSPITAL=\"ORGANIZATION\",\n",
    "    FACILITY=\"LOCATION\",\n",
    ")\n",
    "\n",
    "ner_model_configuration = NerModelConfiguration(model_to_presidio_entity_mapping=model_to_presidio_entity_mapping, \n",
    "                                                aggregation_strategy=\"simple\",\n",
    "                                                stride=14)\n",
    "\n",
    "transformers_nlp_engine = TransformersNlpEngine(models=model_config,\n",
    "                                                ner_model_configuration=ner_model_configuration)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5ef5566d-90d9-484e-9e09-7625ba60cffb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run it as part of Presidio's AnalyzerEngine\n",
    "call_analyzer_and_print_results(transformers_nlp_engine)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "13dd60ab-a44e-4951-9159-e593d5a17d27",
   "metadata": {},
   "source": [
    "## 4. Supporting multiple languages\n",
    "Presidio allows the user to create a model per language:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2d52af66-5764-464b-9476-2868cf7f6851",
   "metadata": {},
   "outputs": [],
   "source": [
    "from presidio_analyzer.nlp_engine import TransformersNlpEngine, NerModelConfiguration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "65e059da-5d10-491f-99da-beaf3e2e58d5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define which model to use\n",
    "model_config = [{\n",
    "   \"lang_code\":\"en\",\n",
    "   \"model_name\":{\n",
    "      \"spacy\":\"en_core_web_sm\",\n",
    "      \"transformers\":\"obi/deid_roberta_i2b2\"\n",
    "   }\n",
    "},\n",
    "{\n",
    "    \"lang_code\":\"es\",\n",
    "    \"model_name\":{\n",
    "      \"spacy\":\"es_core_news_sm\",\n",
    "      \"transformers\":\"PlanTL-GOB-ES/roberta-large-bne-capitel-ner\"\n",
    "   }\n",
    "}]\n",
    "\n",
    "transformers_nlp_engine = TransformersNlpEngine(models=model_config,\n",
    "                                                ner_model_configuration=ner_model_configuration)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8f2020e8-adab-4ff8-8f16-0d6200dbae71",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Call in English\n",
    "call_analyzer_and_print_results(transformers_nlp_engine, \n",
    "                                language=\"en\", \n",
    "                                text = \"Bill Clinton was the president of the United States\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e9fe8668-ae84-4b68-b048-fe1b346726aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Call in Spanish\n",
    "call_analyzer_and_print_results(transformers_nlp_engine, \n",
    "                                language=\"es\", \n",
    "                                text = \"Bill Clinton solía ser el presidente de los Estados Unidos.\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "presidio_e2e",
   "language": "python",
   "name": "presidio"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
